import numpy as np
import pandas as pd
import anndata
import scanpy as sc
import seaborn as sns
import scrublet as scr
sc.settings.verbosity = 1 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=120, color_map='viridis')
sc.logging.print_versions()
fish390_1 = sc.read_10x_mtx(
'/rds/general/user/lb616/projects/popdc1/live/zfish_ven/zfish_ven_cell_ranger_counts/390-1/outs/filtered_feature_bc_matrix', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
fish390_1.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
fish390_1
sc.pp.filter_cells(fish390_1, min_genes=200)
fish390_1
mito_genes = fish390_1.var_names.str.startswith('mt-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
fish390_1.obs['percent_mito'] = np.sum(
fish390_1[:, mito_genes].X, axis=1).A1 / np.sum(fish390_1.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
fish390_1.obs['n_counts'] = fish390_1.X.sum(axis=1).A1
fish390_1.obs['log_counts'] = np.log(fish390_1.obs['n_counts'])
fish390_1.obs['n_genes'] = (fish390_1.X > 0).sum(1).A1
ribo_genes = fish390_1.var_names.str.startswith(('rps','rpl')) # Seaches for all genes that start with 'RPS' or 'RPL'
fish390_1.obs['percent_ribo'] = np.sum(
fish390_1[:, ribo_genes].X, axis=1).A1 / np.sum(fish390_1.X, axis=1).A1
fish390_1
scrub = scr.Scrublet(fish390_1.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
fish390_1.obs['scrublet_score'] = doublet_scores
fish390_1.obs['genotype'] = 'wildtype'
fish390_1.obs['Source'] = 'Nuclei'
fish390_1.obs['Region'] = 'ventricle'
fish390_1.obs['Sample_Number'] = '1'
fish390_1.obs['Sample'] = '390-1'
sc.pl.highest_expr_genes(fish390_1, n_top=20)
sc.pl.violin(fish390_1, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(fish390_1, x='n_counts', y='percent_mito')
sc.pl.scatter(fish390_1, x='n_counts', y='n_genes')
fish390_1
fish390_2 = sc.read_10x_mtx(
'/rds/general/user/lb616/projects/popdc1/live/zfish_ven/zfish_ven_cell_ranger_counts/390-2/outs/filtered_feature_bc_matrix', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
fish390_2.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
fish390_2
sc.pp.filter_cells(fish390_2, min_genes=200)
fish390_2
mito_genes = fish390_2.var_names.str.startswith('mt-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
fish390_2.obs['percent_mito'] = np.sum(
fish390_2[:, mito_genes].X, axis=1).A1 / np.sum(fish390_2.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
fish390_2.obs['n_counts'] = fish390_2.X.sum(axis=1).A1
fish390_2.obs['log_counts'] = np.log(fish390_2.obs['n_counts'])
fish390_2.obs['n_genes'] = (fish390_2.X > 0).sum(1).A1
ribo_genes = fish390_2.var_names.str.startswith(('rps','rpl')) # Seaches for all genes that start with 'RPS' or 'RPL'
fish390_2.obs['percent_ribo'] = np.sum(
fish390_2[:, ribo_genes].X, axis=1).A1 / np.sum(fish390_2.X, axis=1).A1
scrub = scr.Scrublet(fish390_2.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
fish390_2.obs['scrublet_score'] = doublet_scores
fish390_2.obs['genotype'] = 'wildtype'
fish390_2.obs['Source'] = 'Nuclei'
fish390_2.obs['Region'] = 'ventricle'
fish390_2.obs['Sample_Number'] = '2'
fish390_2.obs['Sample'] = '390-2'
sc.pl.highest_expr_genes(fish390_2, n_top=20)
sc.pl.violin(fish390_2, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(fish390_2, x='n_counts', y='percent_mito')
sc.pl.scatter(fish390_2, x='n_counts', y='n_genes')
fish390_2
fish390_3 = sc.read_10x_mtx(
'/rds/general/user/lb616/projects/popdc1/live/zfish_ven/zfish_ven_cell_ranger_counts/390-3/outs/filtered_feature_bc_matrix', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
fish390_3.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
fish390_3
sc.pp.filter_cells(fish390_3, min_genes=200)
fish390_3
mito_genes = fish390_3.var_names.str.startswith('mt-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
fish390_3.obs['percent_mito'] = np.sum(
fish390_3[:, mito_genes].X, axis=1).A1 / np.sum(fish390_3.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
fish390_3.obs['n_counts'] = fish390_3.X.sum(axis=1).A1
fish390_3.obs['log_counts'] = np.log(fish390_3.obs['n_counts'])
fish390_3.obs['n_genes'] = (fish390_3.X > 0).sum(1).A1
ribo_genes = fish390_3.var_names.str.startswith(('rps','rpl')) # Seaches for all genes that start with 'RPS' or 'RPL'
fish390_3.obs['percent_ribo'] = np.sum(
fish390_3[:, ribo_genes].X, axis=1).A1 / np.sum(fish390_3.X, axis=1).A1
scrub = scr.Scrublet(fish390_3.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
fish390_3.obs['scrublet_score'] = doublet_scores
fish390_3.obs['genotype'] = 'wildtype'
fish390_3.obs['Source'] = 'Nuclei'
fish390_3.obs['Region'] = 'ventricle'
fish390_3.obs['Sample_Number'] = '3'
fish390_3.obs['Sample'] = '390-3'
sc.pl.highest_expr_genes(fish390_3, n_top=20)
sc.pl.violin(fish390_3, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(fish390_3, x='n_counts', y='percent_mito')
sc.pl.scatter(fish390_3, x='n_counts', y='n_genes')
fish391_1 = sc.read_10x_mtx(
'/rds/general/user/lb616/projects/popdc1/live/zfish_ven/zfish_ven_cell_ranger_counts/391-1/outs/filtered_feature_bc_matrix', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
fish391_1.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
fish391_1
sc.pp.filter_cells(fish391_1, min_genes=200)
fish391_1
mito_genes = fish391_1.var_names.str.startswith('mt-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
fish391_1.obs['percent_mito'] = np.sum(
fish391_1[:, mito_genes].X, axis=1).A1 / np.sum(fish391_1.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
fish391_1.obs['n_counts'] = fish391_1.X.sum(axis=1).A1
fish391_1.obs['log_counts'] = np.log(fish391_1.obs['n_counts'])
fish391_1.obs['n_genes'] = (fish391_1.X > 0).sum(1).A1
ribo_genes = fish391_1.var_names.str.startswith(('rps','rpl')) # Seaches for all genes that start with 'RPS' or 'RPL'
fish391_1.obs['percent_ribo'] = np.sum(
fish391_1[:, ribo_genes].X, axis=1).A1 / np.sum(fish391_1.X, axis=1).A1
scrub = scr.Scrublet(fish391_1.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
fish391_1.obs['scrublet_score'] = doublet_scores
fish391_1.obs['genotype'] = 'popdc1 null'
fish391_1.obs['Source'] = 'Nuclei'
fish391_1.obs['Region'] = 'ventricle'
fish391_1.obs['Sample_Number'] = '1'
fish391_1.obs['Sample'] = '391-1'
sc.pl.highest_expr_genes(fish391_1, n_top=20)
sc.pl.violin(fish391_1, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(fish391_1, x='n_counts', y='percent_mito')
sc.pl.scatter(fish391_1, x='n_counts', y='n_genes')
fish391_1
fish391_2 = sc.read_10x_mtx(
'/rds/general/user/lb616/projects/popdc1/live/zfish_ven/zfish_ven_cell_ranger_counts/391-2/outs/filtered_feature_bc_matrix', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
fish391_2.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
fish391_2
sc.pp.filter_cells(fish391_2, min_genes=200)
fish391_2
mito_genes = fish391_2.var_names.str.startswith('mt-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
fish391_2.obs['percent_mito'] = np.sum(
fish391_2[:, mito_genes].X, axis=1).A1 / np.sum(fish391_2.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
fish391_2.obs['n_counts'] = fish391_2.X.sum(axis=1).A1
fish391_2.obs['log_counts'] = np.log(fish391_2.obs['n_counts'])
fish391_2.obs['n_genes'] = (fish391_2.X > 0).sum(1).A1
ribo_genes = fish391_2.var_names.str.startswith(('rps','rpl')) # Seaches for all genes that start with 'RPS' or 'RPL'
fish391_2.obs['percent_ribo'] = np.sum(
fish391_2[:, ribo_genes].X, axis=1).A1 / np.sum(fish391_2.X, axis=1).A1
scrub = scr.Scrublet(fish391_2.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
fish391_2.obs['scrublet_score'] = doublet_scores
fish391_2.obs['genotype'] = 'popdc1 null'
fish391_2.obs['Source'] = 'Nuclei'
fish391_2.obs['Region'] = 'ventricle'
fish391_2.obs['Sample_Number'] = '2'
fish391_2.obs['Sample'] = '391-2'
sc.pl.highest_expr_genes(fish391_2, n_top=20)
sc.pl.violin(fish391_2, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(fish391_2, x='n_counts', y='percent_mito')
sc.pl.scatter(fish391_2, x='n_counts', y='n_genes')
fish391_3 = sc.read_10x_mtx(
'/rds/general/user/lb616/projects/popdc1/live/zfish_ven/zfish_ven_cell_ranger_counts/391-3/outs/filtered_feature_bc_matrix', # the directory with the `.mtx` file
var_names='gene_symbols', # use gene symbols for the variable names (variables-axis index)
cache=True) # write a cache file for faster subsequent reading
fish391_3.var_names_make_unique() # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`
fish391_3
sc.pp.filter_cells(fish391_3, min_genes=200)
fish391_3
mito_genes = fish391_3.var_names.str.startswith('mt-') # Seaches for all genes that start with 'MT-'
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
fish391_3.obs['percent_mito'] = np.sum(
fish391_3[:, mito_genes].X, axis=1).A1 / np.sum(fish391_3.X, axis=1).A1
# add the total counts per cell as observations-annotation to adata
fish391_3.obs['n_counts'] = fish391_3.X.sum(axis=1).A1
fish391_3.obs['log_counts'] = np.log(fish391_3.obs['n_counts'])
fish391_3.obs['n_genes'] = (fish391_3.X > 0).sum(1).A1
ribo_genes = fish391_3.var_names.str.startswith(('rps','rpl')) # Seaches for all genes that start with 'RPS' or 'RPL'
fish391_3.obs['percent_ribo'] = np.sum(
fish391_3[:, ribo_genes].X, axis=1).A1 / np.sum(fish391_3.X, axis=1).A1
scrub = scr.Scrublet(fish391_3.X)
doublet_scores, predicted_doublets = scrub.scrub_doublets()
fish391_3.obs['scrublet_score'] = doublet_scores
fish391_3.obs['genotype'] = 'popdc1 null'
fish391_3.obs['Source'] = 'Nuclei'
fish391_3.obs['Region'] = 'ventricle'
fish391_3.obs['Sample_Number'] = '3'
fish391_3.obs['Sample'] = '391-3'
sc.pl.highest_expr_genes(fish391_3, n_top=20)
sc.pl.violin(fish391_3, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(fish391_3, x='n_counts', y='percent_mito')
sc.pl.scatter(fish391_3, x='n_counts', y='n_genes')
fish391_3
# You can merge multiple samples at once by adding them at the start of the function. More info can be found here https://anndata.readthedocs.io/en/latest/anndata.AnnData.concatenate.html
# eg. adata_merged = anndata.AnnData.concatenate(sample1, sample2, sample3, sample4, join = 'outer')
fish390_merged = anndata.AnnData.concatenate(fish390_1, fish390_2, fish390_3, join = 'outer')
fish390_merged.obs
fish390_merged
# Save the unfiltered merged anndata object to .h5ad file
fish390_merged.write('./fish390_merged_unfiltered.h5ad')
sc.pl.violin(fish390_merged, ['n_genes', 'n_counts', 'percent_mito', 'scrublet_score'],
jitter=0.1, multi_panel=True)
# You can merge multiple samples at once by adding them at the start of the function. More info can be found here https://anndata.readthedocs.io/en/latest/anndata.AnnData.concatenate.html
# eg. adata_merged = anndata.AnnData.concatenate(sample1, sample2, sample3, sample4, join = 'outer')
fish391_merged = anndata.AnnData.concatenate(fish391_1, fish391_2, fish391_3, join = 'outer')
fish391_merged.obs
fish391_merged
# Save the unfiltered merged anndata object to .h5ad file
fish391_merged.write('./fish391_merged_unfiltered.h5ad')
sc.pl.violin(fish391_merged, ['n_genes', 'n_counts', 'percent_mito', 'scrublet_score'],
jitter=0.1, multi_panel=True)
# You can merge multiple samples at once by adding them at the start of the function. More info can be found here https://anndata.readthedocs.io/en/latest/anndata.AnnData.concatenate.html
# eg. adata_merged = anndata.AnnData.concatenate(sample1, sample2, sample3, sample4, join = 'outer')
fish_ven_merged = anndata.AnnData.concatenate(fish391_1, fish391_2, fish391_3, fish390_1, fish390_2, fish390_3, join = 'outer')
fish_ven_merged.obs
fish_ven_merged
# Save the unfiltered merged anndata object to .h5ad file
fish_ven_merged.write('./fish_ven_merged_unfiltered.h5ad')
sc.pl.violin(fish_ven_merged, ['n_genes', 'n_counts', 'percent_mito', 'scrublet_score'],
jitter=0.1, multi_panel=True)
fish_ven_merged_filtered = fish_ven_merged.copy()
# Filter by UMI counts
sc.pp.filter_cells(fish_ven_merged_filtered, min_counts=500)
sc.pp.filter_cells(fish_ven_merged_filtered, max_counts=7500)
# Filter by number of genes expressed
sc.pp.filter_cells(fish_ven_merged_filtered, min_genes=300)
sc.pp.filter_cells(fish_ven_merged_filtered, max_genes=2500)
# Filter by %mitochondrial genes, %ribosomal genes, and scublet score (doublet probability)
fish_ven_merged_filtered = fish_ven_merged_filtered[fish_ven_merged_filtered.obs.percent_mito < 0.05, :]
fish_ven_merged_filtered = fish_ven_merged_filtered[fish_ven_merged_filtered.obs.percent_ribo < 0.05, :]
fish_ven_merged_filtered = fish_ven_merged_filtered[fish_ven_merged_filtered.obs.scrublet_score < 0.30, :]
# QC plots before filtering
sc.pl.violin(fish_ven_merged, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.1, multi_panel=True)
# QC plots after filtering
sc.pl.violin(fish_ven_merged_filtered, ['n_genes', 'n_counts', 'percent_mito','percent_ribo', 'scrublet_score'],
jitter=0.1, multi_panel=True)
print('Number of cells before filtering: ' + str(fish_ven_merged.shape[0]))
print('Number of genes before filtering: ' + str(fish_ven_merged.shape[1]))
print('Number of cells after filtering: ' + str(fish_ven_merged_filtered.shape[0]))
print('Number of genes after filtering: ' + str(fish_ven_merged_filtered.shape[1]))
fish_ven_merged.obs['genotype'].value_counts()
fish_ven_merged_filtered.obs['genotype'].value_counts()
sc.pl.highest_expr_genes(fish_ven_merged, n_top=20)
sc.pl.highest_expr_genes(fish_ven_merged_filtered, n_top=20)
fish_ven_merged_filtered.write('./fish_ven_filtered_089042021.h5ad')